Package org.wltea.analyzer.lucene

Source Code of org.wltea.analyzer.lucene.IKQueryParser$TokenBranch

/**
*
*/
package org.wltea.analyzer.lucene;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause.Occur;

import org.wltea.analyzer.IKSegmentation;
import org.wltea.analyzer.Lexeme;

/**
* IK查询分析器
* 实现了对分词歧义结果的非冲突排列组合
* 有效的优化对歧义关键词的搜索命中
* 针对IK Analyzer V3的优化实现
*
* @author 林良益
*
*/
public final class IKQueryParser {
 
 
  //查询关键字解析缓存线程本地变量
  private static ThreadLocal<Map<String , TokenBranch>> keywordCacheThreadLocal
      = new ThreadLocal<Map<String , TokenBranch>>();
 
 
  //是否采用最大词长分词
  private static boolean isMaxWordLength = false;

  /**
   * 设置分词策略
   * isMaxWordLength = true 采用最大词长分词
   * @param isMaxWordLength
   */
  public static void setMaxWordLength(boolean isMaxWordLength) {
    IKQueryParser.isMaxWordLength = isMaxWordLength ;
  }
 
  /**
   * 优化query队列
   * 减少Query表达式的嵌套
   * @param queries
   * @return
   */
  private static Query optimizeQueries(List<Query> queries){ 
    //生成当前branch 的完整query
    if(queries.size() == 0){
      return null;
    }else if(queries.size() == 1){
      return queries.get(0);
    }else{
      BooleanQuery mustQueries = new BooleanQuery();
      for(Query q : queries){
        mustQueries.add(q, Occur.MUST);
      }
      return mustQueries;
    }     
  }
 
  /**
   * 获取线程本地的解析缓存
   * @return
   */
  private static Map<String , TokenBranch> getTheadLocalCache(){
    Map<String , TokenBranch> keywordCache = keywordCacheThreadLocal.get();
    if(keywordCache == null){
       keywordCache = new HashMap<String , TokenBranch>(4);
       keywordCacheThreadLocal.set(keywordCache);
    }
    return keywordCache;
  }
 
  /**
   * 缓存解析结果的博弈树
   * @param query
   * @return
   */
  private static TokenBranch getCachedTokenBranch(String query){
    Map<String , TokenBranch> keywordCache = getTheadLocalCache();
    return keywordCache.get(query);
  }
 
  /**
   * 缓存解析结果的博弈树
   * @param query
   * @return
   */
  private static void cachedTokenBranch(String query , TokenBranch tb){
    Map<String , TokenBranch> keywordCache = getTheadLocalCache();
    keywordCache.put(query, tb);
  }
   
 
  /**
   * 单连续字窜(不带空格符)单Field查询分析
   * @param field
   * @param query
   * @return
   * @throws IOException
   */
  private static Query _parse(String field , String query) throws IOException{
    if(field == null){
      throw new IllegalArgumentException("parameter \"field\" is null");
    }

    if(query == null || "".equals(query.trim())){
      return new TermQuery(new Term(field));
    }
   
    //从缓存中取出已经解析的query生产的TokenBranch
    TokenBranch root = getCachedTokenBranch(query);
    if(root != null){
      return optimizeQueries(root.toQueries(field));
    }else{
      //System.out.println(System.currentTimeMillis());
      root = new TokenBranch(null);   
      //对查询条件q进行分词
      StringReader input = new StringReader(query.trim());
      IKSegmentation ikSeg = new IKSegmentation(input , isMaxWordLength);
      for(Lexeme lexeme = ikSeg.next() ; lexeme != null ; lexeme = ikSeg.next()){
        //处理词元分支
        root.accept(lexeme);
      }
      //缓存解析结果的博弈树
      cachedTokenBranch(query , root);
      return optimizeQueries(root.toQueries(field));
    }
  }
 
  /**
   * 单条件,单Field查询分析
   * @param field -- Document field name
   * @param query -- keyword
   * @return Query 查询逻辑对象
   * @throws IOException
   */
  public static Query parse(String field , String query) throws IOException{
    if(field == null){
      throw new IllegalArgumentException("parameter \"field\" is null");
    }
    String[] qParts = query.split("\\s");
    if(qParts.length > 1){     
      BooleanQuery resultQuery = new BooleanQuery();
      for(String q : qParts){
        //过滤掉由于连续空格造成的空字串
        if("".equals(q)){
          continue;
        }
        Query partQuery = _parse(field , q);
        if(partQuery != null &&
                  (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
          resultQuery.add(partQuery, Occur.SHOULD);
        }
      }
      return resultQuery;
    }else{
      return _parse(field , query);
    }
  }
 
  /**
   * 多Field,单条件查询分析
   * @param fields -- Document fields name
   * @param query  -- keyword
   * @return Query 查询逻辑对象
   * @throws IOException
   */
  public static Query parseMultiField(String[] fields , String query) throws IOException{
    if(fields == null){
      throw new IllegalArgumentException("parameter \"fields\" is null");
    }   
    BooleanQuery resultQuery = new BooleanQuery();   
    for(String field : fields){
      if(field != null){
        Query partQuery = parse(field , query);
        if(partQuery != null &&
                  (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
          resultQuery.add(partQuery, Occur.SHOULD);
        }
      }     
    }   
    return resultQuery;
  }
 
  /**
   * 多Field,单条件,多Occur查询分析
   * @param fields -- Document fields name
   * @param query  -- keyword
   * @param flags -- BooleanClause
   * @return Query 查询逻辑对象
   * @throws IOException
   */
  public static Query parseMultiField(String[] fields , String query ,  BooleanClause.Occur[] flags) throws IOException{
    if(fields == null){
      throw new IllegalArgumentException("parameter \"fields\" is null");
    }
    if(flags == null){
      throw new IllegalArgumentException("parameter \"flags\" is null");
    }
   
    if (flags.length != fields.length){
          throw new IllegalArgumentException("flags.length != fields.length");
    }   
   
    BooleanQuery resultQuery = new BooleanQuery();   
    for(int i = 0; i < fields.length; i++){
      if(fields[i] != null){
        Query partQuery = parse(fields[i] , query);
        if(partQuery != null &&
                  (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
          resultQuery.add(partQuery, flags[i]);
        }
      }     
    }   
    return resultQuery;
  }
 
  /**
   * 多Field多条件查询分析
   * @param fields
   * @param queries
   * @return Query 查询逻辑对象
   * @throws IOException
   */
  public static Query parseMultiField(String[] fields , String[] queries) throws IOException{
    if(fields == null){
      throw new IllegalArgumentException("parameter \"fields\" is null");
    }       
    if(queries == null){
      throw new IllegalArgumentException("parameter \"queries\" is null");
    }       
    if (queries.length != fields.length){
          throw new IllegalArgumentException("queries.length != fields.length");
    }
    BooleanQuery resultQuery = new BooleanQuery();   
    for(int i = 0; i < fields.length; i++){
      if(fields[i] != null){
        Query partQuery = parse(fields[i] , queries[i]);
        if(partQuery != null &&
                  (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
          resultQuery.add(partQuery, Occur.SHOULD);
        }
      }     
    }   
    return resultQuery;
  }

  /**
   * 多Field,多条件,多Occur查询分析
   * @param fields
   * @param queries
   * @param flags
   * @return Query 查询逻辑对象
   * @throws IOException
   */
  public static Query parseMultiField(String[] fields , String[] queries , BooleanClause.Occur[] flags) throws IOException{
    if(fields == null){
      throw new IllegalArgumentException("parameter \"fields\" is null");
    }       
    if(queries == null){
      throw new IllegalArgumentException("parameter \"queries\" is null");
    }
    if(flags == null){
      throw new IllegalArgumentException("parameter \"flags\" is null");
    }
   
      if (!(queries.length == fields.length && queries.length == flags.length)){
          throw new IllegalArgumentException("queries, fields, and flags array have have different length");
      }

      BooleanQuery resultQuery = new BooleanQuery();   
    for(int i = 0; i < fields.length; i++){
      if(fields[i] != null){
        Query partQuery = parse(fields[i] , queries[i]);
        if(partQuery != null &&
                  (!(partQuery instanceof BooleanQuery) || ((BooleanQuery)partQuery).getClauses().length>0)){
          resultQuery.add(partQuery, flags[i]);
        }
      }     
    }   
    return resultQuery;
 
  /**
   * 词元分支
   * 当分词出现歧义时,采用词元分支容纳不同的歧义组合
   * @author 林良益
   *
   */
  private static class TokenBranch{
   
    private static final int REFUSED = -1;
    private static final int ACCEPTED = 0;
    private static final int TONEXT = 1;
   
    //词元分支左边界
    private int leftBorder;
    //词元分支右边界
    private int rightBorder;
    //当前分支主词元
    private Lexeme lexeme;
    //当前分支可并入的词元分支
    private List<TokenBranch> acceptedBranchs;
    //当前分支的后一个相邻分支
    private TokenBranch nextBranch;
   
    TokenBranch(Lexeme lexeme){
      if(lexeme != null){
        this.lexeme = lexeme;
        //初始化branch的左右边界
        this.leftBorder = lexeme.getBeginPosition();
        this.rightBorder = lexeme.getEndPosition();
      }
    }
   
    public int getLeftBorder() {
      return leftBorder;
    }

    public int getRightBorder() {
      return rightBorder;
    }

    public Lexeme getLexeme() {
      return lexeme;
    }

    public List<TokenBranch> getAcceptedBranchs() {
      return acceptedBranchs;
    }

    public TokenBranch getNextBranch() {
      return nextBranch;
    }

    public int hashCode(){
      if(this.lexeme == null){
        return 0;
      }else{
        return this.lexeme.hashCode() * 37;
      }
    }
   
    public boolean equals(Object o){     
      if(o == null){
        return false;
      }   
      if(this == o){
        return true;
      }
      if(o instanceof TokenBranch){
        TokenBranch other = (TokenBranch)o;
        if(this.lexeme == null ||
            other.getLexeme() == null){
          return false;
        }else{
          return this.lexeme.equals(other.getLexeme());
        }
      }else{
        return false;
      }     
    } 
   
    /**
     * 组合词元分支
     * @param _lexeme
     * @return 返回当前branch能否接收词元对象
     */
    boolean accept(Lexeme _lexeme){
     
      /*
       * 检查新的lexeme 对当前的branch 的可接受类型
       * acceptType : REFUSED  不能接受
       * acceptType : ACCEPTED 接受
       * acceptType : TONEXT   由相邻分支接受
       */     
      int acceptType = checkAccept(_lexeme);     
      switch(acceptType){
      case REFUSED:
        // REFUSE 情况
        return false;
       
      case ACCEPTED :
        if(acceptedBranchs == null){
          //当前branch没有子branch,则添加到当前branch下
          acceptedBranchs = new ArrayList<TokenBranch>(2);
          acceptedBranchs.add(new TokenBranch(_lexeme));         
        }else{
          boolean acceptedByChild = false;
          //当前branch拥有子branch,则优先由子branch接纳
          for(TokenBranch childBranch : acceptedBranchs){
            acceptedByChild = childBranch.accept(_lexeme) || acceptedByChild;
          }
          //如果所有的子branch不能接纳,则由当前branch接纳
          if(!acceptedByChild){
            acceptedBranchs.add(new TokenBranch(_lexeme));
          }         
        }
        //设置branch的最大右边界
        if(_lexeme.getEndPosition() > this.rightBorder){
          this.rightBorder = _lexeme.getEndPosition();
        }
        break;
       
      case TONEXT :
        //把lexeme放入当前branch的相邻分支
        if(this.nextBranch == null){
          //如果还没有相邻分支,则建立一个不交叠的分支
          this.nextBranch = new TokenBranch(null);
        }
        this.nextBranch.accept(_lexeme);
        break;
      }

      return true;
    }
   
    /**
     * 将分支数据转成Query逻辑
     * @return
     */
    List<Query> toQueries(String fieldName){     
      List<Query> queries = new ArrayList<Query>(1);     
       //生成当前branch 的query
      if(lexeme != null){
        queries.add(new TermQuery(new Term(fieldName , lexeme.getLexemeText())));
      }     
      //生成child branch 的query
      if(acceptedBranchs != null && acceptedBranchs.size() > 0){
        if(acceptedBranchs.size() == 1){
          Query onlyOneQuery = optimizeQueries(acceptedBranchs.get(0).toQueries(fieldName));
          if(onlyOneQuery != null){
            queries.add(onlyOneQuery);
          }         
        }else{
          BooleanQuery orQuery = new BooleanQuery();
          for(TokenBranch childBranch : acceptedBranchs){
            Query childQuery = optimizeQueries(childBranch.toQueries(fieldName));
            if(childQuery != null){
              orQuery.add(childQuery, Occur.SHOULD);
            }
          }
          if(orQuery.getClauses().length > 0){
            queries.add(orQuery);
          }
        }
      }     
      //生成nextBranch的query
      if(nextBranch != null){       
        queries.addAll(nextBranch.toQueries(fieldName));
      }
      return queries; 
    }
   
    /**
     * 判断指定的lexeme能否被当前的branch接受
     * @param lexeme
     * @return 返回接受的形式
     */
    private int checkAccept(Lexeme _lexeme){
      int acceptType = 0;
     
      if(_lexeme == null){
        throw new IllegalArgumentException("parameter:lexeme is null");
      }
     
      if(null == this.lexeme){//当前的branch是一个不交叠(ROOT)的分支
        if(this.rightBorder > //说明当前branch内至少有一个lexeme
            && _lexeme.getBeginPosition() >= this.rightBorder){
          //_lexeme 与 当前的branch不相交
          acceptType = TONEXT;
        }else{
          acceptType = ACCEPTED;
        }       
      }else{//当前的branch是一个有交叠的分支
       
        if(_lexeme.getBeginPosition() < this.lexeme.getBeginPosition()){
          //_lexeme 的位置比 this.lexeme还靠前(这种情况不应该发生)
          acceptType = REFUSED;
        }else if(_lexeme.getBeginPosition() >= this.lexeme.getBeginPosition()
              && _lexeme.getBeginPosition() < this.lexeme.getEndPosition()){
          // _lexeme 与 this.lexeme相交
          acceptType = REFUSED;
        }else if(_lexeme.getBeginPosition() >= this.lexeme.getEndPosition()
              && _lexeme.getBeginPosition() < this.rightBorder){
          //_lexeme 与 this.lexeme 不相交, 但_lexeme 与 当前的branch相交
          acceptType = ACCEPTED;
        }else{//_lexeme.getBeginPosition() >= this.rightBorder
          //_lexeme 与 当前的branch不相交
          acceptType=  TONEXT;
        }
      }
      return acceptType;
    }
 
  }
}
TOP

Related Classes of org.wltea.analyzer.lucene.IKQueryParser$TokenBranch

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.